library(tidyverse)
library(cluster)
library(factoextra)
library(dendextend)
library(broom)
library(animation)
# Preview the raw data, then plot hard-drive size against RAM to judge
# whether the points form visually separable groups (cluster suitability).
# (A stray `NA` output artifact from the notebook render was removed here.)
computers

computers %>%
  ggplot(aes(x = hd, y = ram)) +
  geom_point()

Is this data suitable for clustering? Tentatively, yes. The scatter plot shows visible grouping on the y axis (RAM) around roughly 4, 8 and 16 (GB?), with a much wider spread of hard-drive sizes on the x axis — so there is some structure to exploit, even though the pattern looks organised and non-linear rather than cleanly separated.

# Keep only the two numeric features we will cluster on.
# select() takes bare column names directly; wrapping them in c() is
# unnecessary.
computers_clean <- computers %>%
  select(ram, hd)

computers_clean
summary(computers_clean)
      ram               hd        
 Min.   : 2.000   Min.   :  80.0  
 1st Qu.: 4.000   1st Qu.: 214.0  
 Median : 8.000   Median : 340.0  
 Mean   : 8.287   Mean   : 416.6  
 3rd Qu.: 8.000   3rd Qu.: 528.0  
 Max.   :32.000   Max.   :2100.0  

First i’m gonna scale the data.

# Standardise both columns (mean 0, sd 1) so hd's much larger range does
# not dominate the Euclidean distances used by k-means.
# mutate_all() is superseded; use across() instead. as.numeric() strips the
# one-column matrix that scale() returns, keeping the columns plain vectors.
computers_scale <- computers_clean %>%
  mutate(across(everything(), ~ as.numeric(scale(.x))))
computers_scale
# k-means with 4 centres; nstart = 25 restarts from 25 random initial
# configurations and keeps the best fit. set.seed() makes the run
# reproducible, since the initial centres are chosen at random.
set.seed(1234)
clustered_computers <- kmeans(computers_scale, centers = 4, nstart = 25)

clustered_computers
K-means clustering with 4 clusters of sizes 419, 904, 2235, 2701

Cluster means:
         ram         hd
1  2.4276068  2.5442566
2  1.3728674  0.5699667
3 -0.1367705  0.2706146
4 -0.7229016 -0.8093732

Clustering vector:
   [1] 4 4 4 4 2 2 4 4 4 4 4 4 4 4 4 4 4 4 4 3 4 4 2 4 3 4 4 3 2 4 4 4 4 4 4 4 4 3 4 4 4 3 4 4
  [45] 2 4 4 4 4 4 4 4 3 4 4 4 4 4 4 4 3 4 4 4 4 4 4 4 4 4 3 4 4 4 4 4 4 3 4 4 4 4 4 4 4 4 4 4
  [89] 4 3 4 4 4 4 2 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 3 4 3 2 4 4 4 4 4 4 4 4 4 3 4 4 4 4 4 4 4 4
 [133] 4 4 4 4 4 4 4 4 4 3 4 4 3 4 4 4 4 3 4 4 3 4 4 4 4 4 4 4 4 4 4 4 2 4 4 4 4 4 4 4 4 4 3 4
 [177] 4 4 4 4 4 4 4 4 3 4 4 4 2 2 4 4 4 4 4 4 4 4 4 3 3 4 4 4 4 4 4 3 4 3 4 4 4 4 4 4 4 4 2 4
 [221] 4 4 4 3 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 3 4 4 4 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 4 2 3 4 4 4
 [265] 4 4 4 4 2 4 4 3 4 2 4 2 4 4 4 4 4 4 4 4 4 4 4 4 2 4 3 4 4 4 4 2 4 2 4 4 4 4 4 2 4 4 4 4
 [309] 4 3 4 4 2 3 4 4 4 4 4 4 4 2 2 4 4 4 4 4 2 4 4 4 4 4 4 2 4 2 4 4 4 4 3 4 4 4 4 4 4 4 4 4
 [353] 4 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 4 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 4 2
 [397] 4 4 4 3 3 4 3 2 2 4 4 4 3 2 4 4 4 4 2 4 4 4 3 4 2 2 4 4 4 4 2 3 4 3 2 2 2 4 3 4 4 4 4 2
 [441] 4 4 4 4 3 2 4 4 4 4 4 4 4 4 4 4 4 4 4 2 4 4 3 2 4 4 3 4 3 3 4 4 4 3 4 4 4 4 4 4 4 4 4 4
 [485] 4 3 4 3 4 3 1 3 4 3 3 4 2 4 4 4 4 4 4 3 4 4 3 2 3 3 3 4 2 4 4 4 4 4 4 4 4 3 4 4 4 2 3 4
 [529] 4 4 3 4 2 4 4 4 4 4 3 4 2 4 4 4 3 3 4 3 4 4 3 4 2 4 2 4 4 2 2 4 2 4 4 4 4 4 4 4 4 3 4 4
 [573] 4 4 4 4 4 2 3 4 4 3 3 4 3 4 2 4 4 4 4 4 2 4 4 4 3 3 4 4 4 2 4 2 4 4 4 4 4 4 2 3 3 3 4 1
 [617] 4 4 4 4 4 4 3 4 4 3 3 4 4 4 4 4 4 4 4 4 4 4 4 3 4 4 4 2 4 2 2 3 4 4 3 3 3 3 4 4 4 4 4 4
 [661] 2 4 4 4 4 3 2 4 4 3 4 2 3 2 4 3 4 4 3 4 4 4 3 3 4 3 3 3 4 3 4 4 4 2 4 4 3 4 2 4 4 4 4 2
 [705] 4 4 2 4 4 4 2 4 4 3 4 4 2 2 3 3 1 4 4 4 4 4 4 2 3 4 4 4 4 4 4 4 4 4 2 2 2 4 4 4 4 2 3 4
 [749] 4 3 4 3 4 2 4 4 4 4 4 4 4 4 4 2 4 4 2 4 4 3 3 4 2 2 2 3 4 4 2 3 4 4 2 3 4 4 4 3 4 2 4 2
 [793] 4 3 4 3 2 4 2 3 4 2 4 3 4 3 4 3 4 4 3 4 4 2 4 4 3 4 4 4 3 3 4 3 2 4 4 4 4 2 3 4 4 4 4 4
 [837] 2 3 4 4 3 4 3 2 3 4 4 4 4 4 4 3 3 4 2 3 4 4 4 3 3 4 4 4 3 4 4 3 3 3 4 4 3 4 4 3 4 4 4 4
 [881] 4 2 4 3 2 4 4 2 4 4 3 2 4 4 3 4 2 2 4 3 1 3 2 4 3 4 4 4 2 4 2 4 4 4 4 4 4 4 4 4 3 4 3 4
 [925] 4 4 4 4 3 4 4 2 4 4 4 3 3 3 4 4 4 2 4 4 3 3 4 3 4 4 2 3 4 4 2 3 4 3 4 2 3 4 3 4 3 2 4 4
 [969] 4 3 3 3 3 4 4 3 4 4 4 4 4 1 2 4 3 3 4 4 4 4 4 4 4 1 2 3 3 4 3 4
 [ reached getOption("max.print") -- omitted 5259 entries ]

Within cluster sum of squares by cluster:
[1] 452.8436 286.4189 787.9572 423.3334
 (between_SS / total_SS =  84.4 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"   
[7] "size"         "iter"         "ifault"      

Next I'm going to use

tidy()

glance()

augment()

to look at the clusters and the data contained within them in a bit more detail.

Here I'm looking at the centre (mean) and the size of each cluster.

# Per-cluster summary: centre coordinates (in scaled units) plus cluster
# size, with the centre columns named after the original variables.
clustered_computers %>%
  tidy(col.names = colnames(computers_scale))

Now i’m looking at info about the full clustering.

# Model-level summary statistics for the whole clustering in one row.
glance(clustered_computers)

This function shows which cluster each data point has been assigned to, by joining the fitted cluster labels back onto the original data.

# Join the fitted cluster label (.cluster) back onto the original data,
# then animate the k-means iterations on the scaled data.
augment(clustered_computers, computers)

kmeans.ani(computers_scale, centers = 4)

Now I'm going to look at how many clusters I need: the elbow method finds the k beyond which adding more clusters no longer meaningfully reduces the within-cluster sum of squares (i.e. where the gain in between-cluster heterogeneity levels off).

# Elbow method: fit k-means for every k from 1 to max_k and store the
# tidied / glanced / augmented results for each fit as list-columns.
# seq_len(max_k) is the safe idiom for 1:max_k (empty when max_k == 0).
# (A stray `NA` output artifact from the notebook render was removed here.)
max_k <- 20

k_clusters <- tibble(k = seq_len(max_k)) %>%
  mutate(
    kclust = map(k, ~ kmeans(computers_scale, .x, nstart = 25)),
    tidied = map(kclust, tidy),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, computers)
  )

k_clusters

Unpacking the tibble to look at tot.withinss, which is my key variable.

# Pull the one-row glance() summaries out of the list-column so that
# tot.withinss becomes an ordinary column, one row per k.
comp_clusterings <- k_clusters %>%
  unnest(cols = glanced)

comp_clusterings
# Elbow plot: total within-cluster sum of squares against k, with one
# axis break per candidate k.
comp_clusterings %>%
  ggplot(aes(x = k, y = tot.withinss)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = seq(1, 20, by = 1))

As I predicted, the elbow is at k = 4 — so the plot below is about as good as it gets.

# Re-run the animated k-means at the chosen number of clusters (k = 4).
kmeans.ani(computers_scale, centers = 4)

# Faceted scatter plots of the augmented data for k = 1..4, coloured by
# the assigned cluster, one panel per k.
comp_clusterings %>%
  unnest(cols = augmented) %>%
  filter(k <= 4) %>%
  ggplot(aes(x = ram, y = hd)) +
  geom_point(aes(colour = .cluster)) +
  facet_wrap(~k)

Still unsure whether clustering was really suitable given the unusual distribution; k = 3 might have been a reasonable option too.

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGNsdXN0ZXIpCmxpYnJhcnkoZmFjdG9leHRyYSkKbGlicmFyeShkZW5kZXh0ZW5kKQpsaWJyYXJ5KGJyb29tKQpsaWJyYXJ5KGFuaW1hdGlvbikKYGBgCgpgYGB7cn0KY29tcHV0ZXJzCgpgYGAKCmBgYHtyfQpjb21wdXRlcnMgJT4lCiAgZ2dwbG90KGFlcyh4ID0gaGQsIHkgPSByYW0pKSArCiAgZ2VvbV9wb2ludCgpCmBgYAppbiBhbnN3ZXIgdG8gdGhlIHN1aXRhYmlsaXR5IHRvIGNsdXN0ZXIuLi4uIEkgZHVubm8sIGtpbmRhLi4uLi5Mb2dpYyBpcyB0ZWxsaW5nIG1lIHllcyBvdGhlcndpc2Ugd2hhdCdzIHRoZSBwb2ludCBpbiB0aGUgaGlvbWV3b3JrIGJ1dCBsb29raW5nIGF0IHRoZSBkYXRhIGl0IGxvb2tzIGEgYml0IG9yZ2FuaXNlZCBhbmQgbm9uIGxpbmVhci4gIFNvLi4uLndobyBrbm93cz8/IFllcyBpIHN1cHBvc2UgaXQgaXMsIHRoZXJlJ3MgdmlzdWFsbHkgc29lIGdyb3VwaW5nIG9uIHRoZSB5IGF4aXMgKFJBTSkgYXJvdW5kIGxpa2UgNCwgOCwgMTYgKEdCPykgd2l0aCBhIHdpZGVyIHNwcmVhZCBvbiB0aGUgeCBheGlzLgoKYGBge3J9CmNvbXB1dGVyc19jbGVhbiA8LSBjb21wdXRlcnMgJT4lCiAgc2VsZWN0KGMocmFtLCBoZCkpCgpjb21wdXRlcnNfY2xlYW4KYGBgCmBgYHtyfQpzdW1tYXJ5KGNvbXB1dGVyc19jbGVhbikKYGBgCgoKRmlyc3QgaSdtIGdvbm5hIHNjYWxlIHRoZSBkYXRhLiAgIAoKYGBge3J9CmNvbXB1dGVyc19zY2FsZSA8LSBjb21wdXRlcnNfY2xlYW4gJT4lCiAgbXV0YXRlX2FsbChzY2FsZSkKYGBgCgpgYGB7cn0KY29tcHV0ZXJzX3NjYWxlCmBgYAoKYGBge3J9CmNsdXN0ZXJlZF9jb21wdXRlcnMgPC0ga21lYW5zKGNvbXB1dGVyc19zY2FsZSwgY2VudGVycyA9ICA0LCBuc3RhcnQgPSAyNSkKCmNsdXN0ZXJlZF9jb21wdXRlcnMKYGBgCk5leHQgaSdtIGdvaW5nIHRvIHVzZSAKClRpZHkKCkdsYW5jZSAKCkF1Z21lbnRzCgp0byBoYXZlIGEgbG9vayBhdCB0aGUgY2x1c3RlcnMgYW5kIHRoZSBkYXRhIGNvbnRhaW5lZCB3aXRoaW4gaW4gYSBiaXQgbW9yZSBkZXRhaWwuIAoKSGVyZSBpJ20gbG9va2luZyBhdCBpbmZvIGFib3V0IHRoZSBtZWFuIGFuZCB0aGUgc2l6ZSBvZiBlYWNoIGNsdXN0ZXIuIApgYGB7cn0KdGlkeShjbHVzdGVyZWRfY29tcHV0ZXJzLCAKICAgICBjb2wubmFtZXMgPSBjb2xuYW1lcyhjb21wdXRlcnNfc2NhbGUpKQpgYGAKTm93IGknbSBsb29raW5nIGF0IGluZm8gYWJvdXQgdGhlIGZ1bGwgY2x1c3RlcmluZy4gCgpgYGB7cn0KZ2xhbmNlKGNsdXN0ZXJlZF9jb21wdXRlcnMpCmBgYApUaGlzIGZ1bmN0aW9uIHNob3dzIG1lIHdoaWNoIGNsdXN0ZXIgZWFjaCBkYXRhIHBvaW50IGhhcyBiZWVuIGFkZGVkIHRvLiAgKGNvbXBhcmluZyB0aGUgbmV3IGRhdGEgaSd2ZSBjbHVzdGVyZWQgd2l0aCB0aGUgb2xkIGRhdGEuICkKYGBge3J9CmF1
Z21lbnQoY2x1c3RlcmVkX2NvbXB1dGVycywgY29tcHV0ZXJzKQpgYGAKCmBgYHtyfQpjb21wdXRlcnNfc2NhbGUgJT4lCiAga21lYW5zLmFuaShjZW50ZXJzID0gNCkKYGBgCk5vdyBpJ20gZ29pbmcgdG8gbG9vayBhdCBob3cgbWFueSBjbHVzdGVycyBpIG5lZWQgaW4gb3JkZXIgdG8gaGF2ZSB0aGUgbWF4IGhldGVyb2dlbml0eS4gCgpgYGB7cn0KbWF4X2sgPC0gMjAKCmtfY2x1c3RlcnMgPC0gdGliYmxlKGsgPSAxOm1heF9rKSAlPiUKICBtdXRhdGUoCiAgICBrY2x1c3QgPSBtYXAoaywgfiBrbWVhbnMoY29tcHV0ZXJzX3NjYWxlLCAueCwgbnN0YXJ0ID0gMjUpKSwKICAgIHRpZGllZCA9IG1hcChrY2x1c3QsIHRpZHkpLAogICAgZ2xhbmNlZCA9IG1hcChrY2x1c3QsIGdsYW5jZSksCiAgICBhdWdtZW50ZWQgPSBtYXAoa2NsdXN0LCBhdWdtZW50LCBjb21wdXRlcnMpCiAgKQoKa19jbHVzdGVycwoKYGBgCnVucGFja2luZyB0aGUgdGliYmxlIHRvIGhhdmUgYSBsb29rIGF0IHRoZSB0b3Qud2l0aGluc3Mgd2hpY2ggaXMgbXkga2V5IHZhcmlhYmxlLiAKYGBge3J9CmNvbXBfY2x1c3RlcmluZ3MgPC0ga19jbHVzdGVycyAlPiUKICB1bm5lc3QoZ2xhbmNlZCkKCmNvbXBfY2x1c3RlcmluZ3MKYGBgCgpgYGB7cn0KZ2dwbG90KGNvbXBfY2x1c3RlcmluZ3MsIGFlcyh4PWssIHk9dG90LndpdGhpbnNzKSkgKwogIGdlb21fcG9pbnQoKSArCiAgICBnZW9tX2xpbmUoKSArCiAgICBzY2FsZV94X2NvbnRpbnVvdXMoYnJlYWtzID0gc2VxKDEsIDIwLCBieSA9IDEpKQpgYGAKCmFzIGkgcHJlZGljdGVkLi4uLjQuIHNvIHRoZSBwbG90LiBiZWxvdyBpcyBhcyBnb29kIGFzIGl0IGdldHMuIAoKYGBge3J9CmNvbXB1dGVyc19zY2FsZSAlPiUKICBrbWVhbnMuYW5pKGNlbnRlcnMgPSA0KQpgYGAKCmBgYHtyfQpjb21wX2NsdXN0ZXJpbmdzICU+JSAKICB1bm5lc3QoY29scyA9IGMoYXVnbWVudGVkKSkgJT4lCiAgZmlsdGVyKGsgPD0gNCkgJT4lCiBnZ3Bsb3QoYWVzKHggPSByYW0sIHkgPSBoZCkpICsKICBnZW9tX3BvaW50KGFlcyhjb2xvciA9IC5jbHVzdGVyKSkgKyAKICBmYWNldF93cmFwKH4gaykKYGBgCgpzdGlsbCB1bnN1cmUgaWYgY2x1c3RlcmluZyB3YXMgc3VpdGFibGUgZ2l2ZW4gdGhlIHdlaXJkIGRpc3RyaWJ1dGlvbi4gIGsgPSAzIG1pZ2h0IGhhdmUgYmVlbiBhbiBvcHRpb24gdG9vLiA=